
###########################################
#### 03_create_juror_data.R ###############
##########################################

# SETUP
# limit scientific notation: a positive scipen makes R prefer fixed over
# exponential notation when it formats numbers
# NOTE(review): later steps compare numeric mentions against preference strings,
# which may depend on how numbers are formatted -- confirm before changing this
options(scipen=10)
# load packages
# p_load() (from pacman) attaches every package listed below
library(pacman)
p_load(dplyr, ggplot2, here, estimatr, tidyr, stringr, modelsummary)
# declare where this script sits relative to the project root; the here() calls
# below build their paths from that root
i_am("Code/03_create_juror_data.R")
# helper script; presumably defines find_and_replace(), which is called further
# down but is not defined in this file -- confirm
source(here("Code", "word_to_number.R"))

#

# load data ####

# juror-level data on jury speech
# (the merge below uses an object called `juror`, so this file is expected to
# provide it)
load(here("Data", "juror_level.RData"))
# juror-level speech
speech <- here("Data", "jurorspeechtrial.csv") %>% read.csv()
# jury-level speech info
jury <- here("Data", "jury_level.csv") %>% read.csv()
# transcript-level speech info
transcript <- here("Data", "transcript_details.csv") %>% read.csv()

# merge data sources
# juror records (without their own Identifier) + speech variables by case_id,
# then jury-level flags by jurynum
dat <- juror %>%
  select(-Identifier) %>%
  left_join(
    speech %>%
      select(case_id, Identifier, jurortext, age, income, sex.x, educ,
             jurortext_rd1, jurortext_rd2),
    by = "case_id"
  ) %>%
  rename(sex = sex.x) %>%
  left_join(
    jury %>% select(jurynum, cant_code, attempt, no_ids),
    by = "jurynum"
  )

# the transcript key is passed through numeric and back to character before the
# merge; no `by` is given, so the join uses whatever columns the tables share
transcript[["transcript_id_numbers"]] <- as.numeric(transcript$transcriptnumbers)
dat <- dat %>%
  left_join(
    transcript %>%
      select(transcript_id_numbers, turns, words, turns_attributed,
             words_attributed) %>%
      mutate(transcript_id_numbers = as.character(transcript_id_numbers))
  )

# calculate transcript-level statistics
#   jury_prop_words_attributed: attributed words as a share of all transcript words
#   speechlength:               whitespace-delimited tokens in the juror's text
#   prop_attributed_words:      juror's tokens as a share of attributed words
dat <- dat %>%
  mutate(
    jury_prop_words_attributed = words_attributed / words,
    speechlength = str_count(jurortext, "\\S+"),
    prop_attributed_words = speechlength / words_attributed
  )

# speakers: jurors with an Identifier, on juries with no_ids below 5 and no
# cant_code recorded
# NOTE(review): the original comment described this as "juries with at least 4
# identified"; whether `no_ids < 5` expresses that depends on what no_ids
# counts -- confirm
speakers <- dat %>%
  filter(
    !is.na(Identifier),
    no_ids < 5,
    is.na(cant_code)
  )

# separate by round and code new variables
# Reshape to one row per juror (case_id) per round that has text, re-attach the
# juror-level covariates, and derive:
#   type      - what was deliberated in that round: order A is Dollars then
#               Scale, order B is Scale then Dollars
#   dissenter - 1 if the juror's dissenter flag for that type is TRUE, else 0
#   diss_high - 1 if the juror's side for that type is "higher", else 0
# `round` holds the characters "1"/"2" (the column-name suffix left after
# names_prefix is stripped); the comparisons with 1 and 2 rely on R's coercion.
tab1 <- dat %>%
  select(jurortext_rd1, jurortext_rd2, case_id, jury_prop_words_attributed) %>%
  pivot_longer(
    cols = c(jurortext_rd1, jurortext_rd2),
    names_to = "round",
    values_to = "text",
    names_prefix = "jurortext_rd"
  ) %>%
  filter(!is.na(text)) %>%
  # no `by`: joins on the columns shared by both tables
  left_join(
    dat %>%
      select(case_id, white, idoll, iscale, order, scenario, jurynum,
             dissenter_scale, dissenter_doll, side_doll, side_scale) %>%
      distinct()
  ) %>%
  mutate(
    type = case_when(
      order == "A" & round == 1 ~ "Dollars",
      order == "A" & round == 2 ~ "Scale",
      order == "B" & round == 1 ~ "Scale",
      order == "B" & round == 2 ~ "Dollars"
    ),
    # TRUE (not the reassignable shorthand T) is the catch-all branch
    dissenter = case_when(
      type == "Dollars" & dissenter_doll == TRUE ~ 1,
      type == "Scale" & dissenter_scale == TRUE ~ 1,
      TRUE ~ 0
    ),
    diss_high = case_when(
      type == "Dollars" & side_doll == "higher" ~ 1,
      type == "Scale" & side_scale == "higher" ~ 1,
      TRUE ~ 0
    )
  )
# number of whitespace-delimited tokens in the round's text
tab1$length <- str_count(tab1$text, "\\S+")



# add juror speech-level preference mentions ####

# first, turn scale point names into numbers
# str_replace_all() converts EVERY occurrence in a text; str_replace() would
# only convert the first one, so repeated mentions of the same scale label
# would never reach the mention counts computed later.
# "extremely severe" must be handled before "severe" so the longer label is not
# broken up by the shorter pattern.
# NOTE(review): the patterns have no word boundaries, so they also match inside
# longer words (e.g. "mildly") -- confirm this is acceptable.
tab1$text <- tab1$text %>%
  str_replace_all("none|None", "0") %>%
  str_replace_all("mild|Mild", "2 ") %>%
  str_replace_all("substantial|Substantial", "4 ") %>%
  str_replace_all("extremely severe|Extremely severe", "8 ") %>%
  str_replace_all("severe|Severe", "6 ")


# drop any grouping carried over from earlier steps and rows without text
tab1 <- tab1 %>%
  ungroup() %>%
  filter(!is.na(text))

# remove problematic text
# strip one specific phrase, then transliterate the text from UTF-8 to ASCII
tab1 <- tab1 %>%
  mutate(
    text = str_remove(text, "nine hundred thousand million"),
    text = iconv(text, from = "UTF-8", to = "ASCII//TRANSLIT")
  )

# locate preference mentions and convert to numeric
# find_and_replace() comes from the sourced helper script; it is called once
# per text for the default result (stored as nums) and once with ret = "loc"
# (stored as nums_loc). Both results are coerced to integer.
tab1$nums <- lapply(
  tab1$text,
  function(txt) as.integer(find_and_replace(txt))
)
tab1$nums_loc <- lapply(
  tab1$text,
  function(txt) as.integer(find_and_replace(txt, ret = "loc"))
)

# remove non-standard assignments
tab1 <- filter(tab1, !is.na(order))

# remove 100 mil & 200 mil
# NOTE(review): 200000 is dropped as well, which the comment above does not
# mention -- confirm that this is intended
# rem1[[i]] is TRUE for the mentions of row i that are kept
rem1 <- lapply(tab1$nums, function(x) !(x %in% c(200000, 100000000, 200000000)))

# and transform numbers that likely have different magnitudes
# Within one text, a mention x is rewritten to x*1000 when x*1000 is also
# mentioned in that text; failing that, to x*1000000 when that value is
# mentioned; otherwise it is left unchanged. The number of mentions does not
# change, so nums and nums_loc stay aligned element by element.
# seq_len() makes the loop a no-op when tab1 has no rows.
for (i in seq_len(nrow(tab1))) {
  kept_idx <- which(rem1[[i]])
  kept <- tab1$nums[[i]][kept_idx]
  tab1$nums_loc[i] <- list(tab1$nums_loc[[i]][kept_idx])

  as_thou <- kept * 1000
  as_mill <- kept * 1000000
  rescaled <- ifelse(as_thou %in% kept, as_thou,
                     ifelse(as_mill %in% kept, as_mill, kept))
  # list() wrapper so that a zero-length result is stored rather than dropped
  tab1$nums[i] <- list(rescaled)
}

# pull in jury pref mentions/text
# NOTE(review): the join below reads doll_prefs/scale_prefs from `dat`, so this
# file presumably (re)defines `dat` with those columns -- confirm
load(here("Data", "jury_with_numbers.RData"))

# merge jury scale and dollar preferences
# distinct() keeps the lookup to unique rows so the join cannot multiply tab1
# rows merely because `dat` repeats a jury; it has no effect when `dat` already
# holds one row per jury
tab1 <- left_join(
  tab1,
  dat %>% select(jurynum, doll_prefs, scale_prefs) %>% distinct()
)
# pick the preference string that matches what was deliberated in the round
tab1 <- tab1 %>% mutate(prefs = case_when(type=="Dollars"~doll_prefs,
                                          type=="Scale"~scale_prefs))
# keep only prefs that belong to a jury member
# prefs is a space-separated string; split it into one vector per row
tab1$prefs <- str_split(tab1$prefs, " ")
for (i in seq_len(nrow(tab1))) {
  keep <- unlist(tab1$prefs[[i]])
  row_nums <- unlist(tab1$nums[[i]])
  row_locs <- unlist(tab1$nums_loc[[i]])
  # Work out which mentions survive ONCE, from the unfiltered numbers, and use
  # the same positions for numbers and locations. Recomputing the match after
  # nums has been overwritten would select the first k locations rather than
  # the locations of the kept mentions.
  is_pref <- which(row_nums %in% keep)
  # each cell holds a one-element list wrapping the vector; downstream code
  # unlists before use
  tab1$nums[[i]] <- list(row_nums[is_pref])
  tab1$nums_loc[[i]] <- list(row_locs[is_pref])
}

# calculate:
  # how often does each juror mention their own pref?
  # how often do other jurors on the jury mention their pref?

# code own preference
# the juror's own dollar or scale preference, matching the round's type
tab1 <- tab1 %>%
  mutate(own_pref = case_when(type=="Dollars"~as.integer(idoll),
                              type=="Scale"~as.integer(iscale)))
# count total number of pref mentions
tab1$total_pref_mentions <- vapply(
  tab1$nums,
  function(x) length(unlist(x)),
  integer(1)
)

# count own and others' mentions
# create all three result columns up front, so none of them is first created
# by an indexed assignment inside the loop
tab1$own_pref_mentions <- NA
tab1$other_pref_mentions <- NA
tab1$total_other_pref_mentions <- NA
for (i in seq_len(nrow(tab1))) {
  # rows of the other jurors on the same jury for the same type
  others <- tab1$jurynum == tab1$jurynum[i] &
    tab1$case_id != tab1$case_id[i] &
    tab1$type == tab1$type[i]
  #mentions of own pref (doll and scale)
  tab1$own_pref_mentions[i] <- sum(unlist(tab1$nums[[i]]) == tab1$own_pref[i])
  #others' mentions of pref
  tab1$other_pref_mentions[i] <- sum(unlist(tab1$nums[others]) == tab1$own_pref[i])
  #total prefs others mention
  tab1$total_other_pref_mentions[i] <- sum(tab1$total_pref_mentions[others])
}



# attach each juror's Identifier from the juror-level data
tab1 <- left_join(tab1, juror %>% select(case_id, Identifier))

# count foreperson mentions of pref
# For every row: how often the foreperson of the same jury, in the same round,
# mentions this juror's own preference (fore_pref_mentions), and how many
# preference mentions the foreperson makes in total (fore_total_mentions).
# Both stay NA when no foreperson row exists for that jury and round.
tab1$fore_pref_mentions <- NA
tab1$fore_total_mentions <- NA
is_foreperson <- tab1$Identifier %in% c("Foreman", "Forewoman")
for (i in seq_len(nrow(tab1))) {
  same_round <- tab1$round == tab1$round[i]
  fore <- tab1$case_id[tab1$jurynum == tab1$jurynum[i] & same_round & is_foreperson]
  if (length(fore) > 0) {
    # %in% and sum() give the same result as `== fore` when exactly one
    # foreperson row matches, and avoid silent recycling if several match
    fore_rows <- tab1$case_id %in% fore & same_round
    tab1$fore_pref_mentions[i] <- sum(unlist(tab1$nums[fore_rows]) == tab1$own_pref[i])
    tab1$fore_total_mentions[i] <- sum(tab1$total_pref_mentions[fore_rows])
  }
}




# adding speech timing

# attach each jury's transcript identifiers; distinct() keeps the lookup to
# unique rows so the join cannot multiply tab1 rows merely because `dat`
# repeats a jury
tab1 <- left_join(
  tab1,
  dat %>% select(jurynum, transcript_id, transcript_id_numbers) %>% distinct()
)

# list of files with transcripts by speaking turn
sheets <- as.data.frame(list.files(here("DOCX Documents", "csv_speakingturns")))
colnames(sheets) <- "transcript_id"

# clean names for merging
# drop a "Corr2"/"corr2" marker, then every letter, colon, dot, hyphen and space
# from the file name; what remains is matched against transcript_id_numbers
sheets$transcript_id_numbers<-str_remove_all(str_remove(sheets$transcript_id, "Corr2|corr2"),"[qwertyuioplkjhgfdsazxcvbnmQWERTYUIOPLKJHGFDSAZXCVBNM:.-]")
sheets$transcript_id_numbers<-str_remove_all(sheets$transcript_id_numbers," ")

# in each sheet, count the first turn and last turn for each juror, plus record total turns
# Round 1 is every turn up to and including the first "Envelope #3" row; round 2
# runs from that row to the end of the sheet. Turn positions are counted within
# the round's slice, and total_turns is the number of rows in that slice.
tab1 <- tab1 %>%
  mutate(first_turn = NA,
         last_turn = NA,
         total_turns = NA)

envelope3_pattern <- "Envelope #3|Envelope # 3|E nvelope #3|Envelope 3|Envelope  3|E n velope # 3|Envelope  # 3|Envelope  #3"

# each sheet is read and scanned once and then reused for every row that
# belongs to the same transcript
sheet_cache <- list()
for (i in seq_len(nrow(tab1))) {
  sheet_file <- as.character(
    sheets$transcript_id[sheets$transcript_id_numbers == tab1$transcript_id_numbers[i]][1]
  )
  if (is.na(sheet_file)) {
    # reading would fail on a non-existent path anyway; fail with a clear message
    stop("no speaking-turn sheet found for transcript ",
         tab1$transcript_id_numbers[i], call. = FALSE)
  }
  if (is.null(sheet_cache[[sheet_file]])) {
    turns <- read.csv(here("DOCX Documents", "csv_speakingturns", sheet_file))
    spot <- which(grepl(envelope3_pattern, turns$juror))
    if (length(spot) == 0) {
      stop("no 'Envelope #3' marker found in ", sheet_file, call. = FALSE)
    }
    sheet_cache[[sheet_file]] <- list(
      # speaker labels normalised to upper case without spaces
      jurors = str_remove_all(toupper(turns$juror), " "),
      # if the marker appears more than once, the first occurrence is used
      spot = spot[1]
    )
  }
  sheet <- sheet_cache[[sheet_file]]

  if (tab1$round[i] == 1) {
    in_round <- seq_len(sheet$spot)
  } else if (tab1$round[i] == 2) {
    in_round <- sheet$spot:length(sheet$jurors)
  } else {
    # unknown round: leave the timing columns as NA
    next
  }

  round_jurors <- sheet$jurors[in_round]
  id2 <- str_remove_all(toupper(tab1$Identifier[i]), " ")
  hits <- which(round_jurors == id2)
  if (length(hits) > 0) {
    tab1$first_turn[i] <- hits[1]
    tab1$last_turn[i] <- hits[length(hits)]
    tab1$total_turns[i] <- length(in_round)
  }
}


# save data
# write the finished juror-by-round table, under the object name tab1, to the
# project's Data folder
save(list = "tab1", file = here("Data", "juror_mentions_byround.RData"))


